10 years ago · e791f75648
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -33,7 +33,7 @@ module Agents
 
                 
              
 
                       "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed texts.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove comma from a formatted number, etc.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
              
 
                 
              
 
                -      Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions all namespaces are stripped from the document.
              
 
                +      Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions all namespaces are stripped from the document unless a toplevel option `use_namespaces` is set to true.
              
 
                 
              
 
                       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
              
 
                 
              
@@ -302,9 +302,13 @@ module Agents
 
                     end
              
 
                 
              
 
                     def use_namespaces?
              
 
                -      interpolated['extract'].none? { |name, extraction_details|
              
 
                -        extraction_details.key?('xpath')
              
 
                -      }
              
 
                +      if interpolated.key?('use_namespaces') # an explicitly set option overrides auto-detection
              
 
                +        boolify(interpolated['use_namespaces'])
              
 
                +      else # default: keep namespaces only when no extraction has an 'xpath' key
              
 
                +        interpolated['extract'].none? { |_name, extraction_details|
              
 
                +          extraction_details.key?('xpath')
              
 
                +        }
              
 
                +      end
              
 
                     end
              
 
                 
              
 
                     def extract_each(&block)
              
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -401,6 +401,28 @@ describe Agents::WebsiteAgent do
 
                           expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
              
 
                         end
              
 
                 
              
 
                +        it "works with XPath with namespaces unstripped" do
              
 
                +          @checker.options['use_namespaces'] = 'true' # opt in to keeping XML namespaces
              
 
                +          @checker.save!
              
 
                +          expect {
              
 
                +            @checker.check
              
 
                +          }.to change { Event.count }.by(0) # NOTE(review): presumably the extract inherited from setup uses un-prefixed XPaths, which match nothing once namespaces are kept — confirm
              
 
                +
              
 
                +          @checker.options['extract'] = { # switch to namespace-qualified XPaths (xmlns:/media: prefixes)
              
 
                +            'title' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => 'normalize-space(./xmlns:title)' },
              
 
                +            'url' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './xmlns:link[1]/@href' },
              
 
                +            'thumbnail' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './media:thumbnail/@url' },
              
 
                +          }
              
 
                +          @checker.save!
              
 
                +          expect {
              
 
                +            @checker.check
              
 
                +          }.to change { Event.count }.by(20) # NOTE(review): 20 presumably equals the number of entries in the fixture feed — confirm
              
 
                +          event = Event.last
              
 
                +          expect(event.payload['title']).to eq('Shift to dev group')
              
 
                +          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
              
 
                +          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
              
 
                +        end
              
 
                +
              
 
                         it "works with CSS selectors" do
              
 
                           @checker.options['extract'] = {
              
 
                             'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
              
@@ -429,6 +451,23 @@ describe Agents::WebsiteAgent do
 
                           expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
              
 
                           expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
              
 
                         end
              
 
                +
              
 
                +        it "works with CSS selectors with namespaces stripped" do
              
 
                +          @checker.options['extract'] = { # CSS selectors only; value XPaths use un-prefixed element names
              
 
                +            'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
              
 
                +            'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
              
 
                +            'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
              
 
                +          }
              
 
                +          @checker.options['use_namespaces'] = 'false' # explicitly request stripping; the string is presumably coerced by boolify — confirm
              
 
                +          @checker.save!
              
 
                +          expect {
              
 
                +            @checker.check
              
 
                +          }.to change { Event.count }.by(20) # NOTE(review): 20 presumably equals the number of entries in the fixture feed — confirm
              
 
                +          event = Event.last
              
 
                +          expect(event.payload['title']).to eq('Shift to dev group')
              
 
                +          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
              
 
                +          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
              
 
                +        end
              
 
                       end
              
 
                 
              
 
                       describe "JSON" do